Data setup
Load data
counts1 <- read_tsv("../2_alignedData/featureCounts/genes.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.),"Aligned.sortedByCoord.out.bam")) %>%
set_colnames(str_remove(colnames(.),".fq.gz"))
counts2 <- read_tsv("../../20190122_Q96K97_NoStress_RNASeq/2_alignedData/featureCounts/genes_ss.out") %>%
set_colnames(basename(colnames(.))) %>%
set_colnames(str_remove(colnames(.), "_ssAligned.sortedByCoord.out.bam")) %>%
dplyr::select(Geneid, starts_with("W"))
counts <- counts1 %>%
full_join(counts2) %>%
column_to_rownames("Geneid") %>%
set_colnames(metaData$sample[match(colnames(.), metaData$oldName)]) %>%
rownames_to_column("Geneid")
Create DGE List
dgeList <- counts %>%
as.data.frame() %>%
column_to_rownames("Geneid") %>%
DGEList() %>%
calcNormFactors()
# Set group variables
dgeList$samples$group <- colnames(dgeList) %>%
str_extract("201\\d(Q96K97|K97Gfs|PSEN2|SORL1)") %>%
factor(
levels = c(
"2016Q96K97", "2016K97Gfs", "2016PSEN2", "2017Q96K97",
"2018PSEN2", "2019Q96K97", "2019SORL1"
)
)
dgeList$samples$sample <- colnames(dgeList)
Data QC
# Perform logical test to see how many genes were not detected in dataset
dgeList$counts %>%
rowSums() %>%
is_greater_than(0) %>%
table()
## .
## FALSE TRUE
## 3840 28217
# Check for genes having >= 3 samples with cpm > 1
dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(3) %>%
table()
## .
## FALSE TRUE
## 12525 19532
# Create logical vector of genes to keep that fit criteria
genes2keep <- dgeList %>%
cpm() %>%
is_greater_than(1) %>%
rowSums() %>%
is_weakly_greater_than(3)
# Create new DGEList of genes fitting criteria
dgeFilt <- dgeList[genes2keep,, keep.lib.sizes = FALSE] %>%
calcNormFactors()
# Compare distributions of the DGELists before and after filtering
par(mfrow = c(1,2))
dgeList %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "Before Filtering")
dgeFilt %>%
cpm(log = TRUE) %>%
plotDensities(legend = FALSE, main = "After Filtering")

par(mfrow = c(1,1))
Library size check
# Check library sizes with box plot
dgeFilt$samples %>%
ggplot(aes(group, lib.size, fill = group)) +
geom_boxplot() +
scale_y_continuous(labels = comma) +
labs(x = "Family", y = "Library Size") +
theme(legend.position = "none")

PCA
Setup
# Assess cpm values to make sure PCA results are not heavily skewed by highly expressed genes
pca <- dgeFilt %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pca)$importance %>% pander()
| Standard deviation |
61.42 |
46.8 |
34.09 |
22.69 |
21.84 |
19.87 |
16.37 |
14.65 |
13.58 |
13.27 |
12.15 |
11.34 |
10.68 |
10.14 |
10.05 |
9.823 |
9.487 |
9.19 |
8.942 |
8.821 |
8.754 |
8.56 |
8.467 |
8.128 |
7.975 |
7.883 |
7.597 |
7.263 |
6.84 |
8.673e-14 |
| Proportion of Variance |
0.3446 |
0.2001 |
0.1061 |
0.04702 |
0.04357 |
0.03606 |
0.02448 |
0.0196 |
0.01685 |
0.01609 |
0.01348 |
0.01175 |
0.01042 |
0.00939 |
0.00923 |
0.00881 |
0.00822 |
0.00771 |
0.0073 |
0.00711 |
0.007 |
0.00669 |
0.00655 |
0.00603 |
0.00581 |
0.00568 |
0.00527 |
0.00482 |
0.00427 |
0 |
| Cumulative Proportion |
0.3446 |
0.5446 |
0.6508 |
0.6978 |
0.7414 |
0.7774 |
0.8019 |
0.8215 |
0.8384 |
0.8545 |
0.8679 |
0.8797 |
0.8901 |
0.8995 |
0.9087 |
0.9175 |
0.9258 |
0.9335 |
0.9408 |
0.9479 |
0.9549 |
0.9616 |
0.9681 |
0.9741 |
0.98 |
0.9856 |
0.9909 |
0.9957 |
1 |
1 |
# Create function for plotting PCA
ggPCA <- function(x, y){
pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c(x, y))) %>%
left_join(dgeFilt$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
x, y, colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0(x, " (", percent(summary(pca)$importance[2, x]), ")")) +
ylab(label = paste0(y, " (", percent(summary(pca)$importance[2, y]), ")")) +
theme_bw()
}
Plots
ggPCA("PC1", "PC2")

# Set up grid for visualisation of 3 PCA's at once.
vp1 <- viewport(x = 0.25, y = 0.75, width = 0.5, height = 0.5)
vp2 <- viewport(x = 0.75, y = 0.75, width = 0.5, height = 0.5)
vp3 <- viewport(x = 0.25, y = 0.25, width = 0.5, height = 0.5)
# Plot PCA plots in grid
if (interactive()) grid::grid.newpage()
print(
ggPCA("PC2", "PC3") +
theme(legend.position = "none"),
vp = vp1
)
print(
ggPCA("PC3", "PC4") +
theme(legend.position = "none"),
vp = vp2
)
print(
ggPCA("PC4", "PC5") +
guides(
colour = guide_legend(ncol = 2),
shape = guide_legend(ncol = 3)
) +
theme(legend.position = c(1.6, 0.5)),
vp = vp3
)

Differential Expression
Gene labeller
# Set up gene labeller for easy conversion
geneLabeller <- as_labeller(
structure(genesGR$gene_name, names = genesGR$gene_id)
)
Voom
voomData <- voom(dgeFilt, design = matrix(1, nrow = ncol(dgeFilt)))
pcaDesign <- cbind(Intercept = 1, pca$x[,1:6])
pcaFit <- lmFit(voomData, pcaDesign) %>%
eBayes
# topTable of PC1
topTablePC1 <- topTable(pcaFit, coef = "PC1", n = Inf) %>%
as_tibble() %>%
set_colnames(str_remove_all(colnames(.), "ID.")) %>%
unite("Range", start, end, sep = "-") %>%
unite("Location", seqnames, Range, strand, sep = ":") %>%
dplyr::select(
gene_id, gene_name, AveExpr, logFC,
P.Value, adj.P.Val, t, Location, entrezid
)
Just grab one or two to test:
voomData$E[rownames(topTable(pcaFit, coef = "PC1", n = Inf))[1:10],] %>%
t() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("sample") %>%
as_tibble() %>%
gather("gene_id", "logCPM", -sample) %>%
left_join(voomData$targets) %>%
left_join(metaData, by = "sample") %>%
cbind(pca$x[.$sample,]) %>%
ggplot(aes(PC1, logCPM, colour = group, shape = provider, label = sex)) +
geom_point() +
facet_wrap(~gene_id, labeller = geneLabeller)

Gene length and GC content
Setup
# # This is extremely slow, takes approximately 30 minutes.
# # Saved as .rds file to save time
# id <- rownames(dgeFilt)
# gcContent <- getGeneLengthAndGCContent(id, org = "dre", mode = "biomart")
# saveRDS(gcContent, "../files/gcContent.rds")
geneGCLen <- readRDS("../files/gcContent.rds") %>%
as.data.frame() %>%
rownames_to_column("gene_id") %>%
as_tibble()
# Get transcript lengths and choose longest transcript to be gene length
transLen <- transcriptLengths(ensDb) %>% as_tibble()
geneLen <- transLen %>%
arrange(gene_id, desc(tx_len)) %>%
distinct(gene_id, .keep_all = TRUE)
Gene Length
Initial inspection
topTablePC1 %>%
mutate(rank = seq_len(nrow(.))) %>%
left_join(geneGCLen) %>%
ggplot(aes(rank, length)) +
geom_point() +
scale_y_log10() +
geom_smooth(method = "lm", se = FALSE)

topTablePC1 %>%
mutate(rank = seq_len(nrow(.))) %>%
left_join(geneGCLen) %>%
ggplot(aes(t, length)) +
geom_point() +
scale_y_log10() +
geom_smooth(method = "lm", se = FALSE)

Length comparison
# Grab DE down genes and join lengths
# Outliers were also removed to make boxplots look nicer (only 7 genes)
deDown <- topTablePC1 %>%
dplyr::filter(adj.P.Val < 0.05, logFC < 0) %>%
dplyr::select(gene_id, gene_name) %>%
dplyr::mutate(group = "down")
# Grab DE up genes and join lengths
deUp <- topTablePC1 %>%
dplyr::filter(adj.P.Val < 0.05, logFC > 0) %>%
dplyr::select(gene_id, gene_name) %>%
dplyr::mutate(group = "up")
# Grab non DE genes and join lengths
deNon <- topTablePC1 %>%
dplyr::filter(adj.P.Val > 0.05) %>%
dplyr::select(gene_id, gene_name) %>%
dplyr::mutate(group = "non")
# Join into one tibble and box plot to compare
deDown %>%
full_join(deUp) %>%
full_join(deNon) %>%
left_join(geneGCLen) %>%
ggplot(aes(group, length, fill = group)) +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(labels = comma) +
labs(x = "DE", y = "Gene Length") +
coord_cartesian(ylim = c(0, 10000)) +
theme(legend.position = "none")

LogFC vs length
topTablePC1 %>%
left_join(geneGCLen) %>%
mutate(logFC = logFC^(1/3)) %>%
ggplot(aes(length, logFC, colour = gc)) +
geom_point(alpha = 0.5) +
scale_color_gradient2(low = "black", mid = "red", high = "yellow") +
scale_x_log10(labels = comma) +
xlab("Length (bp)") +
ylab("LogFC")

GC content
GC content comparison
deDown %>%
full_join(deUp) %>%
full_join(deNon) %>%
left_join(geneGCLen) %>%
ggplot(aes(group, gc, fill = group)) +
geom_boxplot() +
labs(x = "DE", y = "GC Content") +
theme(legend.position = "none")

LogFC vs GC content
topTablePC1 %>%
left_join(geneGCLen) %>%
ggplot(aes(gc, logFC)) +
geom_point() +
scale_x_continuous(label = scales::percent_format(accuracy = 1)) +
xlab("GC content") +
ylab("LogFC")

Gene level variance
# Create function for isolating columns per family and calculating variance
famVar <- function(x){
dgeFilt %>%
cpm() %>%
as.data.frame() %>%
dplyr::select(str_subset(colnames(.), paste0(x, "_Wt\\d"))) %>%
apply(1, var) %>%
as_tibble(rownames = "geneid") %>%
dplyr::mutate(group = x)
}
# Calculate variance for each family
var1 <- famVar("2016Q96K97")
var2 <- famVar("2016K97Gfs")
var3 <- famVar("2016PSEN2")
var4 <- famVar("2017Q96K97")
var5 <- famVar("2018PSEN2")
var6 <- famVar("2019Q96K97")
var7 <- famVar("2019SORL1")
# Plot boxplot
var1 %>%
full_join(var2) %>%
full_join(var3) %>%
full_join(var4) %>%
full_join(var5) %>%
full_join(var6) %>%
full_join(var7) %>%
ggplot(aes(group, value, fill = group)) +
geom_boxplot(outlier.shape = NA) +
labs(x = "Family", y = "Gene level variance", fill = "Provider") +
scale_fill_discrete(
labels = c(
"ACRF", "ACRF", "ACRF", "SAHMRI", "SAHMRI", "Novogene", "Novogene"
)
) +
coord_cartesian(ylim = c(0, 75))

GO analysis
ID conversion
# # Some packages are required to be unloaded before biomaRt can be loaded
# detach("package:EDASeq", unload = TRUE)
# detach("package:ensembldb", unload = TRUE)
# detach("package:GenomicFeatures", unload = TRUE)
# library(biomaRt)
# # Human entrez and ensembl IDs
# humanEntrezEns <- org.Hs.egENSEMBL %>%
# as.data.frame %>%
# set_colnames(c("Hs_entrez", "Hs_ensembl"))
# # Zebrafish ensembl IDs
# zebEns <- org.Dr.egENSEMBL %>%
# as.data.frame %>%
# set_colnames(c("Dr_entrez", "Dr_ensembl"))
# # Create a data.frame to map between human entrezgenes and zebrafish ensembl IDs.
# # BioMart only includes homolog mappings for ensembl IDs which is why we need to
# # retrieve human ensembl IDs, then join to the humanEntrezEns data.frame,
# # in order to get the desired human entrezgenes to zebrafish ensembl ID mapping.
# zebMart <- useMart("ENSEMBL_MART_ENSEMBL", "drerio_gene_ensembl")
# getFromBiomart <- c("ensembl_gene_id", "hsapiens_homolog_ensembl_gene")
# ens2entrez <- getBM(
# getFromBiomart,
# values = unique(zebEns$Dr_ensembl),
# mart = zebMart
# ) %>%
# set_colnames(c("Dr_ensembl", "Hs_ensembl")) %>%
# left_join(humanEntrezEns, by = "Hs_ensembl") %>%
# dplyr::select(-Hs_ensembl) %>%
# dplyr::filter(complete.cases(.)) %>%
# as_tibble()
GO enrichment
# de <- topTablePC1 %>%
# dplyr::filter(adj.P.Val < 0.05) %>%
# dplyr::select(gene_id) %>%
# left_join(ens2entrez, by = c("gene_id" = "Dr_ensembl")) %>%
# dplyr::filter(!is.na(Hs_entrez)) %>%
# .[["Hs_entrez"]] %>%
# unique()
# uv <- topTablePC1 %>%
# dplyr::select(gene_id) %>%
# left_join(ens2entrez, by = c("gene_id" = "Dr_ensembl")) %>%
# dplyr::filter(!is.na(Hs_entrez)) %>%
# .[["Hs_entrez"]] %>%
# unique()
# goResults <- goana(de = de, universe = uv, species = "Hs")
# goResults %>%
# rownames_to_column("GO ID") %>%
# as_tibble() %>%
# dplyr::filter(DE > 1) %>%
# dplyr::arrange(P.DE) %>%
# mutate(FDR = p.adjust(P.DE, "fdr")) %>%
# dplyr::filter(FDR < 0.05) %>%
# mutate(`GO ID` = str_replace(`GO ID`, ":", "\\\\:")) %>%
# pander(justify = "llrrrrr")
Steve’s wizardry
Initial inspection
geneGCLen %>%
cbind(pca$rotation[.$gene_id,]) %>%
as_tibble() %>%
ggplot(aes(length, PC1, colour = gc)) +
geom_point() +
scale_x_log10(labels = comma) +
scale_colour_viridis_c(option = "inferno")

Bin setup
# Create table breaking GC content and gene length into bins with approximately
# equal amounts of genes in each bin, along with their contribution to each
# principal component axis.
nBins <- list(length = 20, gc = 20)
binTable <- geneGCLen %>%
cbind(pca$rotation[.$gene_id,]) %>%
as_tibble() %>%
mutate(
lengthBins = cut(
log(length),
# breaks = nBins$length,
breaks = quantile(
log(length), seq(0, nBins$length)/nBins$length
),
labels = paste0("L", seq_len(nBins$length)),
include.lowest = TRUE
),
gcBins = cut(
gc,
# breaks = nBins$gc,
breaks = quantile(
gc, seq(0, nBins$gc) / nBins$gc
),
labels = paste0("GC", seq_len(nBins$gc)),
include.lowest = TRUE
),
bothBins = paste(lengthBins, gcBins, sep = "_"),
bothBins = as.factor(bothBins)
) %>%
group_by(bothBins) %>%
mutate(n = n()) %>%
ungroup() %>%
dplyr::filter(n > 1) %>%
as_tibble()
# Setup functions to get ranges of bins.
minLen <- function(x){
binTable %>%
dplyr::filter(lengthBins == x) %>%
dplyr::select(length) %>%
min()
}
maxLen <- function(x){
binTable %>%
dplyr::filter(lengthBins == x) %>%
dplyr::select(length) %>%
max()
}
minGC <- function(x){
binTable %>%
dplyr::filter(gcBins == x) %>%
dplyr::select(gc) %>%
min()
}
maxGC <- function(x){
binTable %>%
dplyr::filter(gcBins == x) %>%
dplyr::select(gc) %>%
max()
}
# Put ranges of bins in table.
binRanges <- tibble(
Bin = c(1:20),
minLen = c(minLen("L1"), minLen("L2"), minLen("L3"), minLen("L4"),
minLen("L5"), minLen("L6"), minLen("L7"), minLen("L8"),
minLen("L9"), minLen("L10"), minLen("L11"), minLen("L12"),
minLen("L13"), minLen("L14"), minLen("L15"), minLen("L16"),
minLen("L17"), minLen("L18"), minLen("L19"), minLen("L20")),
maxLen = c(maxLen("L1"), maxLen("L2"), maxLen("L3"), maxLen("L4"),
maxLen("L5"), maxLen("L6"), maxLen("L7"), maxLen("L8"),
maxLen("L9"), maxLen("L10"), maxLen("L11"), maxLen("L12"),
maxLen("L13"), maxLen("L14"), maxLen("L15"), maxLen("L16"),
maxLen("L17"), maxLen("L18"), maxLen("L19"), maxLen("L20")),
minGC = c(minGC("GC1"), minGC("GC2"), minGC("GC3"), minGC("GC4"),
minGC("GC5"), minGC("GC6"), minGC("GC7"), minGC("GC8"),
minGC("GC9"), minGC("GC10"), minGC("GC11"), minGC("GC12"),
minGC("GC13"), minGC("GC14"), minGC("GC15"), minGC("GC16"),
minGC("GC17"), minGC("GC18"), minGC("GC19"), minGC("GC20")),
maxGC = c(maxGC("GC1"), maxGC("GC2"), maxGC("GC3"), maxGC("GC4"),
maxGC("GC5"), maxGC("GC6"), maxGC("GC7"), maxGC("GC8"),
maxGC("GC9"), maxGC("GC10"), maxGC("GC11"), maxGC("GC12"),
maxGC("GC13"), maxGC("GC14"), maxGC("GC15"), maxGC("GC16"),
maxGC("GC17"), maxGC("GC18"), maxGC("GC19"), maxGC("GC20"))
) %>%
mutate(
minGC = round(100*minGC, digits = 0),
maxGC = round(100*maxGC, digits = 0),
minLen = round(minLen / 1000, digits = 1),
maxLen = round(maxLen / 1000, digits = 1)
) %>%
unite("Length", minLen, maxLen, sep = "-") %>%
unite("GC_Content", minGC, maxGC, sep = "-")
binRanges
## # A tibble: 20 x 3
## Bin Length GC_Content
## <int> <chr> <chr>
## 1 1 0.1-0.8 22-33
## 2 2 0.8-1 33-34
## 3 3 1-1.2 34-35
## 4 4 1.2-1.4 35-36
## 5 5 1.4-1.6 36-37
## 6 6 1.6-1.8 37-37
## 7 7 1.8-1.9 37-38
## 8 8 1.9-2.1 38-39
## 9 9 2.1-2.3 39-40
## 10 10 2.3-2.5 40-41
## 11 11 2.5-2.7 41-42
## 12 12 2.7-3 42-42
## 13 13 3-3.3 42-43
## 14 14 3.3-3.6 43-44
## 15 15 3.6-4.1 44-46
## 16 16 4.1-4.6 46-47
## 17 17 4.6-5.3 47-48
## 18 18 5.3-6.1 48-50
## 19 19 6.1-7.5 50-52
## 20 20 7.6-98.6 52-67
PC1
# Fit linear model to explain PC1 by GC content and gene length.
lm_PC1 <- binTable %>%
lm(PC1 ~ 0 + bothBins, data = .)
# FDR adjust for multiple testing.
# Add significance column for filtering insignificant bins.
binFDR_PC1 <- summary(lm_PC1)$coef %>%
as.data.frame() %>%
rownames_to_column("Bin") %>%
as_tibble() %>%
mutate(FDR = p.adjust(`Pr(>|t|)`, "fdr")) %>%
mutate(Bin = str_remove(Bin, "bothBins")) %>%
separate(Bin, into = c("length", "GC")) %>%
dplyr::mutate(
sig = FDR < 0.05,
p = `Pr(>|t|)`
) %>%
dplyr::select(length, GC, sig, FDR, p)
# Plot gene length bins vs GC contents bins showing contribution to PC1
# and significance.
coef(lm_PC1) %>%
enframe() %>%
mutate(name = str_remove(name, "bothBins")) %>%
separate(name, into = c("length", "GC")) %>%
left_join(binFDR_PC1) %>%
dplyr::filter(sig) %>%
mutate(
length = str_remove(length, "L") %>% as.integer(),
GC = str_remove(GC, "GC") %>% as.integer()
) %>%
ggplot(aes(length, GC)) +
geom_point(aes(colour = value, size = -log10(p))) +
scale_x_continuous(
breaks = seq_len(nBins$length),
labels = binRanges$Length
) +
scale_y_continuous(
breaks = seq_len(nBins$gc),
labels = binRanges$GC_Content
) +
scale_size_continuous(guide = FALSE) +
labs(
x = "Gene Length (kb)",
y = "GC Content (%)",
colour = paste("Contribution", "to PC1", sep = "\n"),
size = "Significance"
) +
theme(
panel.grid.minor = element_blank(),
axis.text.x = element_text(
angle = -45, hjust = 0.1, vjust = 0.5
)
)

PC2
# Fit linear model to explain PC2 by GC content and gene length.
lm_PC2 <- binTable %>%
lm(PC2 ~ 0 + bothBins, data = .)
# FDR adjust for multiple testing.
# Add significance column for filtering insignificant bins.
binFDR_PC2 <- summary(lm_PC2)$coef %>%
as.data.frame() %>%
rownames_to_column("Bin") %>%
as_tibble() %>%
mutate(FDR = p.adjust(`Pr(>|t|)`, "fdr")) %>%
mutate(Bin = str_remove(Bin, "bothBins")) %>%
separate(Bin, into = c("length", "GC")) %>%
dplyr::mutate(
sig = FDR < 0.05,
p = `Pr(>|t|)`
) %>%
dplyr::select(length, GC, sig, FDR, p)
# Plot gene length bins vs GC contents bins showing contribution to PC2
# and significance.
coef(lm_PC2) %>%
enframe() %>%
mutate(name = str_remove(name, "bothBins")) %>%
separate(name, into = c("length", "GC")) %>%
left_join(binFDR_PC2) %>%
dplyr::filter(sig) %>%
mutate(
length = str_remove(length, "L") %>% as.integer(),
GC = str_remove(GC, "GC") %>% as.integer()
) %>%
ggplot(aes(length, GC)) +
geom_point(aes(colour = value, size = -log10(p))) +
scale_x_continuous(
breaks = seq_len(nBins$length),
labels = binRanges$Length
) +
scale_y_continuous(
breaks = seq_len(nBins$gc),
labels = binRanges$GC_Content
) +
scale_size_continuous(guide = FALSE) +
labs(
x = "Gene Length (kb)",
y = "GC Content (%)",
colour = paste("Contribution", "to PC2", sep = "\n"),
size = "Significance"
) +
theme(
panel.grid.minor = element_blank(),
axis.text.x = element_text(
angle = -45, hjust = 0.1, vjust = 0.5
)
)

Old POA
# myLmPlot <- function(x){
# par(mfrow = c(2,2))
# plot(x)
# par(mfrow = c(1,1))
# }
We ran an initial model plotting gene-level contributions (i.e. rotations) to PC1 against the two potential predictor variables: log10(length) and gc content.
# geneGCLen %>%
# cbind(pca$rotation[.$gene_id,]) %>%
# as_tibble() %>%
# lm(PC1 ~ log10(length)*gc, data = .) %T>%
# myLmPlot() %>%
# summary() %>%
# pander()
This model clearly violated the assumption of constant variance meaning the basic assumptions for the linear model were not able to be applied. As such, the Box-Cox transformation was applied using an offset of \(\gamma = 1\) and with \(\lambda = 38\), as determined by fitting the profile likelihood for \(\lambda\).
# geneGCLen %>%
# cbind(pca$rotation[.$gene_id,]) %>%
# as_tibble() %>%
# with(
# boxCox((1 + PC1) ~ log(length)*gc,
# lambda = seq(0, 50, length = 50)
# )
# )
The transformed data was then fit and the assumption of constant variance was able to be applied.
# geneGCLen %>%
# cbind(pca$rotation[.$gene_id,]) %>%
# as_tibble() %>%
# mutate(
# bcPC1 = bcnPower(PC1, lambda = 38, gamma = 1)
# ) %>%
# lm(bcPC1 ~ log10(length)*gc, data = .) %T>%
# myLmPlot() %>%
# summary() %>%
# pander()
Whilst not being simply interpretable, this suggests that both GC content and gene length are contributing to the variability detected along PC1.
# geneGCLen %>%
# cbind(pca$rotation[.$gene_id,]) %>%
# as_tibble() %>%
# ggplot(aes(length, PC1, colour = gc)) +
# geom_point() +
# scale_x_log10(labels = comma) +
# scale_colour_viridis_c(option = "inferno")
RUVSeq
Negative controls
# Extract subset of low ranked genes from DE analysis to act as negative
# controls for RUVg procedure
negControl <- topTablePC1 %>%
dplyr::arrange(desc(P.Value)) %>%
.[1:10000,] %>%
.$gene_id
k = 1
# Run RUVSeq for k = 1
RUVg_k1 <- RUVg(dgeFilt$counts, negControl, 1)
PCA
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k1 <- dgeFilt
# Replace with results
dgeFalse_k1$counts <- RUVg_k1$normalizedCounts
# Run PCA function
pcaRUVg_k1 <- dgeFalse_k1 %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k1)$importance %>% pander()
| Standard deviation |
72.83 |
58.38 |
34.92 |
31.2 |
22.51 |
20.29 |
18.02 |
16.1 |
14.5 |
13.32 |
12.75 |
11.73 |
11.32 |
10.51 |
10.07 |
9.816 |
9.607 |
9.404 |
9.043 |
8.955 |
8.695 |
8.467 |
8.41 |
8.167 |
7.929 |
7.774 |
7.628 |
7.268 |
6.667 |
8.439e-14 |
| Proportion of Variance |
0.3673 |
0.2361 |
0.08448 |
0.06742 |
0.03509 |
0.02852 |
0.02248 |
0.01794 |
0.01455 |
0.0123 |
0.01127 |
0.00953 |
0.00887 |
0.00765 |
0.00702 |
0.00667 |
0.00639 |
0.00613 |
0.00566 |
0.00555 |
0.00524 |
0.00497 |
0.0049 |
0.00462 |
0.00435 |
0.00419 |
0.00403 |
0.00366 |
0.00308 |
0 |
| Cumulative Proportion |
0.3673 |
0.6034 |
0.6879 |
0.7553 |
0.7904 |
0.8189 |
0.8414 |
0.8594 |
0.8739 |
0.8862 |
0.8975 |
0.907 |
0.9159 |
0.9235 |
0.9306 |
0.9372 |
0.9436 |
0.9497 |
0.9554 |
0.961 |
0.9662 |
0.9712 |
0.9761 |
0.9807 |
0.985 |
0.9892 |
0.9933 |
0.9969 |
1 |
1 |
# Plot PCA
pcaRUVg_k1$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k1$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k1)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k1)$importance[2, "PC2"]), ")")) +
theme_bw()

k = 2
# Run RUVSeq for k = 2
RUVg_k2 <- RUVg(dgeFilt$counts, negControl, 2)
PCA
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k2 <- dgeFilt
# Replace with results
dgeFalse_k2$counts <- RUVg_k2$normalizedCounts
# Run PCA function
pcaRUVg_k2 <- dgeFalse_k2 %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k2)$importance %>% pander()
| Standard deviation |
72.96 |
57.92 |
31.88 |
22.48 |
20.21 |
18.08 |
16.15 |
14.76 |
13.4 |
12.79 |
11.75 |
11.36 |
10.47 |
10.06 |
9.808 |
9.626 |
9.405 |
8.965 |
8.864 |
8.686 |
8.445 |
8.383 |
8.169 |
7.961 |
7.789 |
7.664 |
7.291 |
6.721 |
2.433 |
1.006e-13 |
| Proportion of Variance |
0.402 |
0.2533 |
0.07674 |
0.03818 |
0.03084 |
0.02469 |
0.01971 |
0.01645 |
0.01356 |
0.01235 |
0.01042 |
0.00975 |
0.00828 |
0.00765 |
0.00727 |
0.007 |
0.00668 |
0.00607 |
0.00593 |
0.0057 |
0.00539 |
0.00531 |
0.00504 |
0.00479 |
0.00458 |
0.00444 |
0.00402 |
0.00341 |
0.00045 |
0 |
| Cumulative Proportion |
0.402 |
0.6553 |
0.7321 |
0.7702 |
0.8011 |
0.8258 |
0.8455 |
0.8619 |
0.8755 |
0.8879 |
0.8983 |
0.908 |
0.9163 |
0.9239 |
0.9312 |
0.9382 |
0.9449 |
0.951 |
0.9569 |
0.9626 |
0.968 |
0.9733 |
0.9783 |
0.9831 |
0.9877 |
0.9921 |
0.9961 |
0.9996 |
1 |
1 |
# Plot PCA
pcaRUVg_k2$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k2$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k2)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k2)$importance[2, "PC2"]), ")")) +
theme_bw()

k = 3
# Run RUVSeq for k = 3
RUVg_k3 <- RUVg(dgeFilt$counts, negControl, 3)
PCA
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k3 <- dgeFilt
# Replace with results
dgeFalse_k3$counts <- RUVg_k3$normalizedCounts
# Run PCA function
pcaRUVg_k3 <- dgeFalse_k3 %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k3)$importance %>% pander()
| Standard deviation |
73.54 |
42.11 |
23.2 |
20.23 |
18.56 |
16.19 |
15.06 |
13.27 |
12.76 |
11.69 |
11.41 |
10.47 |
10.13 |
9.877 |
9.715 |
9.419 |
9.028 |
8.88 |
8.775 |
8.567 |
8.463 |
8.257 |
7.995 |
7.759 |
7.589 |
7.437 |
6.816 |
1.949 |
1.884 |
1.665e-13 |
| Proportion of Variance |
0.5007 |
0.1641 |
0.04982 |
0.03788 |
0.03188 |
0.02428 |
0.02099 |
0.01629 |
0.01507 |
0.01266 |
0.01205 |
0.01015 |
0.0095 |
0.00903 |
0.00874 |
0.00821 |
0.00755 |
0.0073 |
0.00713 |
0.00679 |
0.00663 |
0.00631 |
0.00592 |
0.00557 |
0.00533 |
0.00512 |
0.0043 |
0.00035 |
0.00033 |
0 |
| Cumulative Proportion |
0.5007 |
0.6648 |
0.7146 |
0.7525 |
0.7844 |
0.8087 |
0.8297 |
0.846 |
0.861 |
0.8737 |
0.8858 |
0.8959 |
0.9054 |
0.9144 |
0.9232 |
0.9314 |
0.9389 |
0.9462 |
0.9533 |
0.9601 |
0.9668 |
0.9731 |
0.979 |
0.9846 |
0.9899 |
0.995 |
0.9993 |
0.9997 |
1 |
1 |
# Plot PCA
pcaRUVg_k3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k3$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k3)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k3)$importance[2, "PC2"]), ")")) +
theme_bw()

k = 4
# Run RUVSeq for k = 4
RUVg_k4 <- RUVg(dgeFilt$counts, negControl, 4)
PCA
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k4 <- dgeFilt
# Replace with results
dgeFalse_k4$counts <- RUVg_k4$normalizedCounts
# Run PCA function
pcaRUVg_k4 <- dgeFalse_k4 %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k4)$importance %>% pander()
| Standard deviation |
73.48 |
34.98 |
21.05 |
20.15 |
16.16 |
14.96 |
13.26 |
12.76 |
11.71 |
11.44 |
10.47 |
10.14 |
9.883 |
9.714 |
9.391 |
9.064 |
8.893 |
8.756 |
8.57 |
8.499 |
8.251 |
8.01 |
7.79 |
7.61 |
7.405 |
6.78 |
1.827 |
1.642 |
1.551 |
7.317e-14 |
| Proportion of Variance |
0.5509 |
0.1249 |
0.04523 |
0.04145 |
0.02665 |
0.02285 |
0.01795 |
0.01661 |
0.01399 |
0.01336 |
0.01118 |
0.01049 |
0.00997 |
0.00963 |
0.009 |
0.00838 |
0.00807 |
0.00782 |
0.00749 |
0.00737 |
0.00695 |
0.00655 |
0.00619 |
0.00591 |
0.00559 |
0.00469 |
0.00034 |
0.00028 |
0.00025 |
0 |
| Cumulative Proportion |
0.5509 |
0.6758 |
0.721 |
0.7624 |
0.7891 |
0.8119 |
0.8299 |
0.8465 |
0.8605 |
0.8739 |
0.885 |
0.8955 |
0.9055 |
0.9151 |
0.9241 |
0.9325 |
0.9406 |
0.9484 |
0.9559 |
0.9633 |
0.9702 |
0.9768 |
0.9829 |
0.9889 |
0.9944 |
0.9991 |
0.9995 |
0.9998 |
1 |
1 |
# Plot PCA
pcaRUVg_k4$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k4$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k4)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k4)$importance[2, "PC2"]), ")")) +
theme_bw()

k = 5
# Run RUVSeq for k = 5
RUVg_k5 <- RUVg(dgeFilt$counts, negControl, 5)
PCA
# Create copy of dgeFilt as framework to replace RUVSeq results into
dgeFalse_k5 <- dgeFilt
# Replace with results
dgeFalse_k5$counts <- RUVg_k5$normalizedCounts
# Run PCA function
pcaRUVg_k5 <- dgeFalse_k5 %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRUVg_k5)$importance %>% pander()
| Standard deviation |
73.52 |
27.87 |
20.33 |
16.21 |
14.99 |
13.27 |
12.77 |
11.76 |
11.5 |
10.52 |
10.17 |
9.889 |
9.752 |
9.412 |
9.033 |
8.886 |
8.803 |
8.601 |
8.509 |
8.253 |
7.996 |
7.804 |
7.61 |
7.398 |
6.758 |
1.787 |
1.616 |
1.197 |
1.096 |
8.744e-14 |
| Proportion of Variance |
0.6052 |
0.08694 |
0.04629 |
0.02943 |
0.02515 |
0.0197 |
0.01826 |
0.01547 |
0.0148 |
0.01238 |
0.01157 |
0.01095 |
0.01065 |
0.00992 |
0.00914 |
0.00884 |
0.00868 |
0.00828 |
0.00811 |
0.00763 |
0.00716 |
0.00682 |
0.00648 |
0.00613 |
0.00511 |
0.00036 |
0.00029 |
0.00016 |
0.00013 |
0 |
| Cumulative Proportion |
0.6052 |
0.6921 |
0.7384 |
0.7678 |
0.793 |
0.8127 |
0.8309 |
0.8464 |
0.8612 |
0.8736 |
0.8852 |
0.8961 |
0.9068 |
0.9167 |
0.9258 |
0.9347 |
0.9433 |
0.9516 |
0.9597 |
0.9674 |
0.9745 |
0.9813 |
0.9878 |
0.9939 |
0.9991 |
0.9994 |
0.9997 |
0.9999 |
1 |
1 |
# Plot PCA
pcaRUVg_k5$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k5$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRUVg_k5)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRUVg_k5)$importance[2, "PC2"]), ")")) +
theme_bw()

Animation
anim0 <- pca$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFilt$samples) %>%
left_join(metaData) %>%
mutate(k = "before RUVSeq")
anim1 <- pcaRUVg_k1$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k1$samples) %>%
left_join(metaData) %>%
mutate(
k = "k = 1"
# PC1 = -PC1,
# PC2 = -PC2
)
anim2 <- pcaRUVg_k2$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k2$samples) %>%
left_join(metaData) %>%
mutate(
k = "k = 2"
# PC1 = -PC1,
# PC2 = -PC2
)
anim3 <- pcaRUVg_k3$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k3$samples) %>%
left_join(metaData) %>%
mutate(
k = "k = 3"
# PC1 = -PC1,
# PC2 = -PC2
)
anim4 <- pcaRUVg_k4$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k4$samples) %>%
left_join(metaData) %>%
mutate(
k = "k = 4"
# PC1 = -PC1,
# PC2 = -PC2
)
anim5 <- pcaRUVg_k5$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeFalse_k5$samples) %>%
left_join(metaData) %>%
mutate(
k = "k = 5"
# PC1 = -PC1,
# PC2 = -PC2
)
anim <- anim0 %>%
full_join(anim1) %>%
full_join(anim2) %>%
full_join(anim3) %>%
full_join(anim4) %>%
full_join(anim5)
anim %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
labs(colour = "Family", shape = "Provider") +
xlab(label = "PC1") +
ylab(label = "PC2") +
coord_cartesian(
xlim = c(-150, 150),
ylim = c(-150, 150)
) +
transition_states(
k,
transition_length = 2,
state_length = 3
) +
ease_aes('cubic-in-out') +
ggtitle("Now showing {closest_state}")

Gene removal
Fitting gene length and GC content as predictors of PC1 showed that genes of length bin 0.1-0.8 kb and gene GC content bin 52-67% were the biggest contributors. Therefore, it should be tested if pulling these genes out restores the PCA to represent more what is expected due to biological variation.
# Filter for genes that have short length and high GC content.
# To be used for subsetting filtered dgeList
geneFilter <- geneGCLen %>%
mutate(
lenTest = length > 800,
gcTest = gc < 0.52
) %>%
dplyr::filter(lenTest, gcTest)
# Subset filtered dgeList
dgeRemoved <- dgeFilt[geneFilter$gene_id,]
PCA
# Run PCA function
pcaRemoved <- dgeRemoved %>%
cpm(log = TRUE) %>%
t() %>%
prcomp()
# Quick inspection to check whether first two PCA components capture most of the variability
summary(pcaRemoved)$importance %>% pander()
| Standard deviation |
51.6 |
38.04 |
29.91 |
21.36 |
19.88 |
18.78 |
15.45 |
13.65 |
12.67 |
12.4 |
11.4 |
10.63 |
9.996 |
9.513 |
9.435 |
9.22 |
8.775 |
8.621 |
8.292 |
8.231 |
8.205 |
7.947 |
7.842 |
7.635 |
7.445 |
7.311 |
7.082 |
6.729 |
6.454 |
7.458e-14 |
| Proportion of Variance |
0.3193 |
0.1735 |
0.1073 |
0.05473 |
0.0474 |
0.0423 |
0.02861 |
0.02235 |
0.01925 |
0.01844 |
0.01559 |
0.01355 |
0.01198 |
0.01085 |
0.01067 |
0.01019 |
0.00923 |
0.00891 |
0.00824 |
0.00812 |
0.00807 |
0.00757 |
0.00737 |
0.00699 |
0.00665 |
0.00641 |
0.00601 |
0.00543 |
0.005 |
0 |
| Cumulative Proportion |
0.3193 |
0.4928 |
0.6001 |
0.6548 |
0.7022 |
0.7445 |
0.7731 |
0.7955 |
0.8147 |
0.8331 |
0.8487 |
0.8623 |
0.8743 |
0.8851 |
0.8958 |
0.906 |
0.9152 |
0.9241 |
0.9324 |
0.9405 |
0.9486 |
0.9561 |
0.9635 |
0.9705 |
0.9771 |
0.9836 |
0.9896 |
0.995 |
1 |
1 |
# Plot PCA
pcaRemoved$x %>%
as.data.frame() %>%
rownames_to_column("sample") %>%
as_tibble() %>%
dplyr::select(sample, one_of(c("PC1", "PC2"))) %>%
left_join(dgeRemoved$samples) %>%
left_join(metaData) %>%
ggplot(
aes_string(
"PC1", "PC2", colour = "group", shape = "provider", label = "sex"
)
) +
geom_point(size = 3, stroke = 0.5) +
# geom_text_repel(show.legend = FALSE) +
labs(colour = "Family", shape = "Provider") +
xlab(label = paste0("PC1", " (", percent(summary(pcaRemoved)$importance[2, "PC1"]), ")")) +
ylab(label = paste0("PC2", " (", percent(summary(pcaRemoved)$importance[2, "PC2"]), ")")) +
theme_bw()

This was also tried with gc < 0.44, but very little difference between the two PCA’s was observed.